library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(viridis)
## Loading required package: viridisLite
library(ggridges)
library(patchwork)
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:viridis':
##
## viridis_pal
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(dbplyr)
##
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
##
## ident, sql
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Years whose result files use the 12-column layout.
years_1 <- c(1900:2012, 2014)
# Years whose result files use the 29-column layout.
years_2 <- 2015:2019

# Read one results file, choosing the column specification from the year
# found in its path.
#
# x: path to a single CSV file.
# Returns the tibble produced by read_csv(); returns NULL (invisibly) when
# the path contains none of the years in `years_1` or `years_2`.
importing_data <- function(x) {
  # TRUE when the path contains any of the given years as a substring.
  has_year_from <- function(years) {
    str_detect(x, str_c(years, collapse = "|"))
  }

  # `years_1` is tested first, so it wins if a path matches both lists.
  column_spec <-
    if (has_year_from(years_1)) {
      "cicccciiiicc"
    } else if (has_year_from(years_2)) {
      "cccicccccccccccccccccciiiiccc"
    } else {
      NULL
    }

  if (is.null(column_spec)) {
    return(invisible(NULL))
  }

  # "NULL", empty strings and "0" are all read as missing values.
  read_csv(x, na = c("NULL", "", "0"), col_types = column_spec)
}
# Read every file in data/ with importing_data() and stack the results into
# one tibble, one row per record.
boston_df <-
  tibble(file_name = list.files("data", full.names = TRUE)) %>%
  mutate(data = map(file_name, importing_data)) %>%
  unnest(data) %>%
  # The year is parsed from the file path (first number found in it).
  mutate(year = readr::parse_number(file_name)) %>%
  # Use `residence` wherever `city` is missing.
  mutate(city = coalesce(city, residence)) %>%
  # Replace every non-alphanumeric character in the name with a space.
  mutate(
    display_name = str_replace_all(display_name, "[^a-zA-Z0-9]", " ")
  ) %>%
  drop_na(display_name) %>%
  select(-c(file_name, residence, first_name, last_name))
# Default figure size and output width for all knitr chunks.
knitr::opts_chunk$set(fig.width = 6, fig.asp = 0.6, out.width = "90%")

# Minimal theme with the legend underneath the plot for every ggplot.
theme_set(theme_minimal() + theme(legend.position = "bottom"))

# Continuous colour and fill scales default to viridis.
options(
  ggplot2.continuous.fill = "viridis",
  ggplot2.continuous.colour = "viridis"
)

# Shadow the default discrete scale constructors with the viridis versions.
scale_colour_discrete <- scale_colour_viridis_d
scale_fill_discrete <- scale_fill_viridis_d
# Age column only, with rows that have no recorded age removed.
boston_df_age_plot <- boston_df %>%
  drop_na(age) %>%
  select(age)
Age distribution as an interactive bar chart built directly with plotly:
# Interactive bar chart built with plot_ly(): number of rows per age,
# with bars coloured by age on the viridis palette.
age_plotly_original <- boston_df_age_plot %>%
  count(age) %>%
  plot_ly(
    type = "bar",
    x = ~age,
    y = ~n,
    color = ~age,
    colors = "viridis"
  ) %>%
  layout(
    xaxis = list(title = "Age"),
    yaxis = list(title = "Number of Participants")
  )
The same age distribution as a static ggplot2 bar chart:
# Static ggplot2 bar chart of the number of participants at each age.
# geom_bar() draws one bar per distinct age value and does no binning, so
# the subtitle states that instead of claiming a 5-year bin width.
age_ggplot <-
  ggplot(boston_df_age_plot, aes(x = age)) +
  geom_bar(fill = "cornflowerblue", color = "white") +
  labs(
    title = "Participants by age",
    subtitle = "one bar per distinct age",
    x = "Age",
    y = "Number of participants"
  )
The ggplot2 bar chart converted to an interactive plotly figure with ggplotly():
# Show the age bar chart as an interactive plotly figure.
ggplotly(age_ggplot)

# Histogram of age in 5-year bins where each bar's height is that bin's
# share of all rows. after_stat() replaces the deprecated `..count..`
# notation; the computed values are the same.
age_percentage_ggplot <-
  ggplot(
    boston_df_age_plot,
    aes(x = age, y = after_stat(count / sum(count)))
  ) +
  geom_histogram(
    fill = "cornflowerblue",
    color = "white",
    binwidth = 5
  ) +
  labs(
    title = "Participants by age",
    y = "Percent",
    x = "Age"
  ) +
  # Format the proportions on the y axis as percentages.
  scale_y_continuous(labels = percent)
Interactive plotly version of the percentage histogram:
# Show the percentage histogram as an interactive plotly figure.
ggplotly(age_percentage_ggplot)

# Kernel density estimate of age, filled and slightly transparent.
density_plot <-
  ggplot(boston_df_age_plot, aes(x = age)) +
  geom_density(
    alpha = 0.8,
    color = "black",
    fill = "cornflowerblue"
  ) +
  labs(title = "Distribution of age")

# Interactive version of the density plot.
ggplotly(density_plot)
# Number of rows per gender.
# "U" is treated as missing, lower-case "m" is folded into "M", and only
# "M"/"F" survive as the factor levels "male"/"female"; any other value
# becomes NA and is dropped before counting.
boston_df_gender <- boston_df %>%
  transmute(
    gender = gender %>%
      na_if("U") %>%
      recode(m = "M") %>%
      factor(levels = c("M", "F"), labels = c("male", "female"))
  ) %>%
  filter(!is.na(gender)) %>%
  count(gender)
# Add each gender's share of the total and a rounded "NN%" text label.
bar_graph_gender <- mutate(
  boston_df_gender,
  pct = n / sum(n),
  pctlabel = paste0(round(pct * 100), "%")
)

# Bar chart of the shares, largest first, with the percentage label drawn
# just above each bar.
ggplot(bar_graph_gender, aes(x = reorder(gender, -pct), y = pct)) +
  geom_col(fill = "indianred3", color = "black") +
  geom_text(aes(label = pctlabel), vjust = -0.25) +
  scale_y_continuous(labels = percent) +
  labs(
    title = "Participants by gender",
    x = "Gender",
    y = "Percent"
  )
# Bar chart of raw counts per gender, with the count printed above each bar
# and thousands separators on the y axis.
bar_graph_frequency <-
  ggplot(bar_graph_gender, aes(x = gender, y = n)) +
  geom_col(fill = "indianred3", color = "black") +
  geom_text(aes(label = n), vjust = -0.5) +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Participants by gender",
    x = "Gender",
    y = "Frequency"
  )
# Number of rows per year and gender.
# "U" is treated as missing, lower-case "m" is folded into "M", and only
# "M"/"F" survive as the factor levels "male"/"female".
# count(year, gender) yields the same rows as group_by() %>% count() but
# returns an ungrouped tibble, so no grouping leaks into later steps.
boston_df_gender_time_plot <-
  boston_df %>%
  select(year, gender) %>%
  mutate(
    gender = na_if(gender, "U"),
    gender = recode(gender, m = "M"),
    gender = factor(gender, levels = c("M", "F"), labels = c("male", "female"))
  ) %>%
  drop_na(gender, year) %>%
  count(year, gender)
# Line chart of yearly counts, one line per gender, with the x axis
# restricted to 1960-2019.
gender_time_plot <-
  ggplot(boston_df_gender_time_plot, aes(x = year, y = n)) +
  geom_line(aes(color = gender), size = 1) +
  xlim(1960, 2019) +
  labs(
    x = "year",
    y = "participants",
    title = "Participants over time by gender"
  )

# Interactive version of the time-series plot.
ggplotly(gender_time_plot)